import easeml as es ## import just one package
### The plotly/cufflinks imports below are only needed in case the plots do not render
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import cufflinks
cufflinks.go_offline(connected=True)
from plotly.offline import iplot, init_notebook_mode
# es.help() prints all available functions and their usage.
# Regression demo: load the housing data and run the automated pipeline
# with 'SalePrice' as the target column ('r' = regression flag).
df = es.importdata('adv-housing.csv')
es.quick_ml(df,'SalePrice','r')
Dataframe Imported Successfully
*** Dataset Infomarion ***
##################################################
No of Rows : 1460
No of columns : 81
No of Numerical columns: 38
No of Categorical columns: 43
##################################################
Total Missing values : 6965
##################################################
Summary of DataFrame:
Column Name Nulls/NaN outof Unique Type of Columns
0 Id 0 1460 1460 Numerical
1 MSSubClass 0 1460 15 Numerical
2 MSZoning 0 1460 5 Categorical
3 LotFrontage 259 1460 110 Numerical
4 LotArea 0 1460 1073 Numerical
.. ... ... ... ... ...
76 MoSold 0 1460 12 Numerical
77 YrSold 0 1460 5 Numerical
78 SaleType 0 1460 9 Categorical
79 SaleCondition 0 1460 6 Categorical
80 SalePrice 0 1460 663 Numerical
[81 rows x 5 columns]
Null value summary:
Total Percent
PoolQC 1453 99.520548
MiscFeature 1406 96.301370
Alley 1369 93.767123
Fence 1179 80.753425
FireplaceQu 690 47.260274
LotFrontage 259 17.739726
GarageCond 81 5.547945
GarageType 81 5.547945
GarageYrBlt 81 5.547945
GarageFinish 81 5.547945
GarageQual 81 5.547945
BsmtExposure 38 2.602740
BsmtFinType2 38 2.602740
BsmtFinType1 37 2.534247
BsmtCond 37 2.534247
BsmtQual 37 2.534247
MasVnrArea 8 0.547945
MasVnrType 8 0.547945
Electrical 1 0.068493
columns dropped Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
'LotFrontage'],
dtype='object')
columns dropped
['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage']
[]
Dropped null values from columns:
['MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']
columns label encoded
Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
'PavedDrive', 'SaleType', 'SaleCondition'],
dtype='object')
Detailed Executing with RandomForest ..
-----------------------------------------
************* Model Results *************
-----------------------------------------
R2 score 0.6273971463490331
MSE score: 1607856695.0580575
RMSE score score: 40098.088421495326
-----------------------------------------
Executing RFE
Please wait.........
### RFE selected columns:
Index(['LotArea', 'OverallQual', 'YearBuilt', 'BsmtFinSF1', 'TotalBsmtSF',
'1stFlrSF', '2ndFlrSF', 'GrLivArea', 'TotRmsAbvGrd', 'GarageArea'],
dtype='object')
RFE Selected Features Please wait Training-Testing with all models.. Done with LinearRegression Done with RandomForestRegression [15:36:16] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror. Done with XGBoostRegressor Done with LGBoostRegressor Done with AdaBoostRegressor Done with SupportVectorMachine Done with GradientBoostingRegression
| Model | R^2 score | Root Mean Squared Error | Root Mean Squared Log Error | Mean Squared Error | |
|---|---|---|---|---|---|
| 1 | RandomForestRegression | 0.88781 | 16097.1 | 0.106443 | 2.59116e+08 |
| 2 | XGBoostRegressor | 0.886151 | 16215.7 | 0.0977401 | 2.62949e+08 |
| 6 | GradientBoostingRegression | 0.883387 | 16411.4 | 0.0990406 | 2.69333e+08 |
| 4 | AdaBoostRegressor | 0.837192 | 19391.4 | 0.108653 | 3.76025e+08 |
| 0 | LinearRegression | 0.759426 | 23571.9 | 0.191409 | 5.55637e+08 |
| 3 | LGBoostRegressor | 0.244415 | 41774.7 | 0.24101 | 1.74512e+09 |
| 5 | SupportVectorMachine | -0.0113428 | 48330.4 | 0.263462 | 2.33583e+09 |
# Classification demo on the loan dataset.
df1 = es.importdata('train_split.csv')
# Remove the 'batch_enrolled' and 'member_id' columns before modelling.
df1 = es.dropcolumns(df1,'batch_enrolled','member_id')
## No hassle cleaning the data by hand: just use the quick_ml function in easeml.
## Pass the dataframe, the target column and a flag ('r' = regression, 'c' = classification).
es.quick_ml(df1,'loan_status','c')
Dataframe Imported Successfully
*** Dataset Infomarion ***
##################################################
No of Rows : 63999
No of columns : 43
No of Numerical columns: 26
No of Categorical columns: 17
##################################################
Total Missing values : 276727
##################################################
Summary of DataFrame:
Column Name Nulls/NaN outof Unique Type of Columns
0 loan_amnt 0 63999 1273 Numerical
1 funded_amnt 0 63999 1275 Numerical
2 funded_amnt_inv 0 63999 1965 Numerical
3 term 0 63999 2 Categorical
4 int_rate 0 63999 459 Numerical
5 grade 0 63999 7 Categorical
6 sub_grade 0 63999 35 Categorical
7 emp_title 3826 63999 31609 Categorical
8 emp_length 3324 63999 11 Categorical
9 home_ownership 0 63999 6 Categorical
10 annual_inc 0 63999 6368 Numerical
11 verification_status 0 63999 3 Categorical
12 pymnt_plan 0 63999 1 Categorical
13 desc 54849 63999 8757 Categorical
14 purpose 0 63999 14 Categorical
15 title 13 63999 6409 Categorical
16 zip_code 0 63999 866 Categorical
17 addr_state 0 63999 51 Categorical
18 dti 0 63999 3943 Numerical
19 delinq_2yrs 0 63999 19 Numerical
20 inq_last_6mths 0 63999 15 Numerical
21 mths_since_last_delinq 32831 63999 103 Numerical
22 mths_since_last_record 54349 63999 121 Numerical
23 open_acc 0 63999 54 Numerical
24 pub_rec 0 63999 14 Numerical
25 revol_bal 0 63999 30608 Numerical
26 revol_util 29 63999 1090 Numerical
27 total_acc 0 63999 101 Numerical
28 initial_list_status 0 63999 2 Categorical
29 total_rec_int 0 63999 50744 Numerical
30 total_rec_late_fee 0 63999 600 Numerical
31 recoveries 0 63999 1754 Numerical
32 collection_recovery_fee 0 63999 1653 Numerical
33 collections_12_mths_ex_med 8 63999 5 Numerical
34 mths_since_last_major_derog 48155 63999 148 Numerical
35 application_type 0 63999 2 Categorical
36 verification_status_joint 63971 63999 3 Categorical
37 last_week_pay 0 63999 87 Categorical
38 acc_now_delinq 0 63999 5 Numerical
39 tot_coll_amt 5124 63999 2579 Numerical
40 tot_cur_bal 5124 63999 52207 Numerical
41 total_rev_hi_lim 5124 63999 3617 Numerical
42 loan_status 0 63999 2 Numerical
Null value summary:
Total Percent
verification_status_joint 63971 99.956249
desc 54849 85.702902
mths_since_last_record 54349 84.921639
mths_since_last_major_derog 48155 75.243363
mths_since_last_delinq 32831 51.299239
tot_cur_bal 5124 8.006375
tot_coll_amt 5124 8.006375
total_rev_hi_lim 5124 8.006375
emp_title 3826 5.978218
emp_length 3324 5.193831
revol_util 29 0.045313
title 13 0.020313
collections_12_mths_ex_med 8 0.012500
columns dropped Index(['verification_status_joint', 'desc', 'mths_since_last_record',
'mths_since_last_major_derog', 'mths_since_last_delinq'],
dtype='object')
columns dropped
['verification_status_joint', 'desc', 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq']
[]
Dropped null values from columns:
['emp_title', 'emp_length', 'title', 'revol_util', 'collections_12_mths_ex_med', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']
columns label encoded
Index(['term', 'grade', 'sub_grade', 'emp_title', 'emp_length',
'home_ownership', 'verification_status', 'pymnt_plan', 'purpose',
'title', 'zip_code', 'addr_state', 'initial_list_status',
'application_type', 'last_week_pay'],
dtype='object')
Detailed Executing with RandomForest ..
-----------------------------------------
************* Model Results *************
-----------------------------------------
F1 Score : 0.7112312634261447
Report:
precision recall f1-score support
0 0.89 0.68 0.77 13359
1 0.34 0.67 0.45 3238
accuracy 0.68 16597
macro avg 0.62 0.68 0.61 16597
weighted avg 0.79 0.68 0.71 16597
-----------------------------------------
Executing RFE
Please wait.........
### RFE selected columns:
Index(['funded_amnt_inv', 'term', 'int_rate', 'sub_grade', 'title', 'dti',
'initial_list_status', 'total_rec_int', 'recoveries', 'last_week_pay'],
dtype='object')
RFE Selected Features
Please wait Training-Testing with all models.. Done with LogisticRegression Done with RandomForestClassifier Done with XGBoost Classifier Done with LGBoost Classifier Done with AdaBoost Classifier Done with GradientBoostingClassifier
| Model | F1 score | AUC-ROC score | Accuracy | Confustion-Matrix | |
|---|---|---|---|---|---|
| 3 | LGBoost Classifier | 0.923457 | 0.75 | 0.933333 | [[26 0] [ 2 2]] |
| 0 | LogisticRegression | 0.872727 | 0.625 | 0.9 | [[26 0] [ 3 1]] |
| 1 | RandomForestClassifier | 0.872727 | 0.625 | 0.9 | [[26 0] [ 3 1]] |
| 2 | XGBoost Classifier | 0.804762 | 0.5 | 0.866667 | [[26 0] [ 4 0]] |
| 4 | AdaBoost Classifier | 0.804762 | 0.5 | 0.866667 | [[26 0] [ 4 0]] |
| 5 | GradientBoostingClassifier | 0.804762 | 0.5 | 0.866667 | [[26 0] [ 4 0]] |
# Regression demo on a second housing dataset; 'price' is the target column.
df = es.importdata('Housing.csv')
es.quick_ml(df,'price','r')
Dataframe Imported Successfully
*** Dataset Infomarion ***
##################################################
No of Rows : 545
No of columns : 13
No of Numerical columns: 6
No of Categorical columns: 7
##################################################
Total Missing values : 0
##################################################
Summary of DataFrame:
Column Name Nulls/NaN outof Unique Type of Columns
0 price 0 545 219 Numerical
1 area 0 545 284 Numerical
2 bedrooms 0 545 6 Numerical
3 bathrooms 0 545 4 Numerical
4 stories 0 545 4 Numerical
5 mainroad 0 545 2 Categorical
6 guestroom 0 545 2 Categorical
7 basement 0 545 2 Categorical
8 hotwaterheating 0 545 2 Categorical
9 airconditioning 0 545 2 Categorical
10 parking 0 545 4 Numerical
11 prefarea 0 545 2 Categorical
12 furnishingstatus 0 545 3 Categorical
Null value summary:
Empty DataFrame
Columns: [Total, Percent]
Index: []
columns dropped Index([], dtype='object')
columns dropped
[]
[]
columns label encoded
Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
'airconditioning', 'prefarea', 'furnishingstatus'],
dtype='object')
Detailed Executing with RandomForest ..
-----------------------------------------
************* Model Results *************
-----------------------------------------
R2 score 0.5499416877756778
MSE score: 1938127160645.6133
RMSE score score: 1392166.3552340337
-----------------------------------------
Executing RFE
Please wait.........
### RFE selected columns:
Index(['area', 'bedrooms', 'bathrooms', 'stories', 'basement',
'hotwaterheating', 'airconditioning', 'parking', 'prefarea',
'furnishingstatus'],
dtype='object')
RFE Selected Features Please wait Training-Testing with all models.. Done with LinearRegression Done with RandomForestRegression [15:48:34] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror. Done with XGBoostRegressor Done with LGBoostRegressor Done with AdaBoostRegressor Done with SupportVectorMachine Done with GradientBoostingRegression
| Model | R^2 score | Root Mean Squared Error | Root Mean Squared Log Error | Mean Squared Error | |
|---|---|---|---|---|---|
| 0 | LinearRegression | 0.536614 | 1.19452e+06 | 0.223221 | 1.42688e+12 |
| 2 | XGBoostRegressor | 0.498153 | 1.24311e+06 | 0.224521 | 1.54531e+12 |
| 6 | GradientBoostingRegression | 0.485068 | 1.25921e+06 | 0.229515 | 1.5856e+12 |
| 4 | AdaBoostRegressor | 0.4731 | 1.27376e+06 | 0.240908 | 1.62246e+12 |
| 1 | RandomForestRegression | 0.437476 | 1.31611e+06 | 0.246043 | 1.73215e+12 |
| 3 | LGBoostRegressor | -0.0563305 | 1.80353e+06 | 0.318165 | 3.2527e+12 |
| 5 | SupportVectorMachine | -0.230739 | 1.94673e+06 | 0.341145 | 3.78975e+12 |
# Classification demo on the car popularity dataset; 'popularity' is the target column.
df = es.importdata('TrainDataset.csv')
es.quick_ml(df,'popularity','c')
Dataframe Imported Successfully
*** Dataset Infomarion ***
##################################################
No of Rows : 1302
No of columns : 7
No of Numerical columns: 7
No of Categorical columns: 0
##################################################
Total Missing values : 0
##################################################
Summary of DataFrame:
Column Name Nulls/NaN outof Unique Type of Columns
0 buying_price 0 1302 4 Numerical
1 maintainence_cost 0 1302 4 Numerical
2 number_of_doors 0 1302 4 Numerical
3 number_of_seats 0 1302 3 Numerical
4 luggage_boot_size 0 1302 3 Numerical
5 safety_rating 0 1302 3 Numerical
6 popularity 0 1302 4 Numerical
Null value summary:
Empty DataFrame
Columns: [Total, Percent]
Index: []
columns dropped Index([], dtype='object')
columns dropped
[]
[]
Detailed Executing with RandomForest ..
-----------------------------------------
************* Model Results *************
-----------------------------------------
F1 Score : 0.9623893476106086
Report:
precision recall f1-score support
1 1.00 0.96 0.98 277
2 0.90 0.96 0.93 94
3 0.86 1.00 0.92 12
4 0.80 1.00 0.89 8
accuracy 0.96 391
macro avg 0.89 0.98 0.93 391
weighted avg 0.96 0.96 0.96 391
-----------------------------------------
Executing RFE
Please wait.........
### RFE selected columns:
Index(['buying_price', 'maintainence_cost', 'number_of_doors',
'number_of_seats', 'luggage_boot_size', 'safety_rating'],
dtype='object')
RFE Selected Features
Please wait Training-Testing with all models.. Done with LogisticRegression Done with RandomForestClassifier Done with XGBoost Classifier Done with LGBoost Classifier Done with AdaBoost Classifier Done with GradientBoostingClassifier
| Model | F1 score | AUC-ROC score | Accuracy | Confustion-Matrix | |
|---|---|---|---|---|---|
| 1 | RandomForestClassifier | 1 | 1 | 1 | [[21 0] [ 0 9]] |
| 3 | LGBoost Classifier | 1 | 1 | 1 | [[21 0] [ 0 9]] |
| 2 | XGBoost Classifier | 0.967137 | 0.97619 | 0.966667 | [[20 1] [ 0 9]] |
| 5 | GradientBoostingClassifier | 0.967137 | 0.97619 | 0.966667 | [[20 1] [ 0 9]] |
| 4 | AdaBoost Classifier | 0.898222 | 0.865079 | 0.9 | [[20 1] [ 2 7]] |
| 0 | LogisticRegression | 0.822222 | 0.753968 | 0.833333 | [[20 1] [ 4 5]] |
# Manual, step-by-step version of the loan classification workflow.
df1 = es.importdata('train_split.csv')
# Print the dataset summary (row/column counts, null counts, column types).
es.info(df1)
es.box_hist_plot(df1,'emp_length')  # presumably box + histogram plots per column -- confirm
# NOTE(review): judging by the logged output, 20 acts as a null-percentage
# threshold (high-null columns are dropped) and the named columns are dropped
# explicitly -- confirm against the easeml docs.
df1 = es.drop_columns(df1,20,'member_id','pymnt_plan','batch_enrolled')
es.missingdata(df1)
# NOTE(review): return value is discarded; presumably modifies df1 in place -- confirm.
es.extract_number(df1,'emp_length')
# Replace NaNs in the text columns with the literal string 'unknown'.
df1 = es.fillnulls(df1,'unknown','emp_title','title')
es.box_hist_plot(df1,'revol_util','tot_cur_bal','total_rev_hi_lim')
df1 = es.dropcolumns(df1,'tot_coll_amt')
# Replace NaNs in these numeric columns with the column mean.
df1 = es.fillnulls(df1,'mean','revol_util','tot_cur_bal','total_rev_hi_lim')
# Label-encode the categorical columns into a new dataframe.
df2 = es.label_encode(df1)
es.corr_heatmap(df2,'interactive')
# NOTE(review): result is not assigned here, unlike the other dropcolumns calls.
# The later OLS/VIF output no longer lists these columns, which suggests the
# drop happens in place -- confirm.
es.dropcolumns(df2,'funded_amnt_inv','funded_amnt','collections_12_mths_ex_med')
es.showbias(df1,'loan_status')  # presumably reports target class balance -- confirm
# Split into features and target.
X = df2.drop('loan_status',axis=1)
y = df2['loan_status']
# Print the OLS regression summary for y against X.
es.stat_models(y,X)
# NOTE(review): df2 still contains the target 'loan_status', so it is included
# in the VIF table; pass X instead if only the features are of interest.
es.VIF(df2)
# NOTE(review): train_test_split is not imported anywhere in the visible file;
# it is presumably sklearn.model_selection.train_test_split and must be
# imported before this cell can run in a fresh session.
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=.7,random_state=101)
print('\n *************** Random Forest ***************\n')
# Fit a random forest and report metrics on the held-out 30%.
y_pred, rfc = es.randomforest_classifier(X_train,y_train,X_test,y_test)
es.classification_result(y_test,y_pred)
es.roc_curve_graph(y_test, y_pred)
Dataframe Imported Successfully
*** Dataset Infomarion ***
##################################################
No of Rows : 63999
No of columns : 45
No of Numerical columns: 27
No of Categorical columns: 18
##################################################
Total Missing values : 286991
##################################################
Summary of DataFrame:
Column Name Nulls/NaN outof Unique Type of Columns
0 member_id 0 63999 63999 Numerical
1 loan_amnt 0 63999 1273 Numerical
2 funded_amnt 0 63999 1275 Numerical
3 funded_amnt_inv 0 63999 1965 Numerical
4 term 0 63999 2 Categorical
5 batch_enrolled 10264 63999 102 Categorical
6 int_rate 0 63999 459 Numerical
7 grade 0 63999 7 Categorical
8 sub_grade 0 63999 35 Categorical
9 emp_title 3826 63999 31609 Categorical
10 emp_length 3324 63999 11 Categorical
11 home_ownership 0 63999 6 Categorical
12 annual_inc 0 63999 6368 Numerical
13 verification_status 0 63999 3 Categorical
14 pymnt_plan 0 63999 1 Categorical
15 desc 54849 63999 8757 Categorical
16 purpose 0 63999 14 Categorical
17 title 13 63999 6409 Categorical
18 zip_code 0 63999 866 Categorical
19 addr_state 0 63999 51 Categorical
20 dti 0 63999 3943 Numerical
21 delinq_2yrs 0 63999 19 Numerical
22 inq_last_6mths 0 63999 15 Numerical
23 mths_since_last_delinq 32831 63999 103 Numerical
24 mths_since_last_record 54349 63999 121 Numerical
25 open_acc 0 63999 54 Numerical
26 pub_rec 0 63999 14 Numerical
27 revol_bal 0 63999 30608 Numerical
28 revol_util 29 63999 1090 Numerical
29 total_acc 0 63999 101 Numerical
30 initial_list_status 0 63999 2 Categorical
31 total_rec_int 0 63999 50744 Numerical
32 total_rec_late_fee 0 63999 600 Numerical
33 recoveries 0 63999 1754 Numerical
34 collection_recovery_fee 0 63999 1653 Numerical
35 collections_12_mths_ex_med 8 63999 5 Numerical
36 mths_since_last_major_derog 48155 63999 148 Numerical
37 application_type 0 63999 2 Categorical
38 verification_status_joint 63971 63999 3 Categorical
39 last_week_pay 0 63999 87 Categorical
40 acc_now_delinq 0 63999 5 Numerical
41 tot_coll_amt 5124 63999 2579 Numerical
42 tot_cur_bal 5124 63999 52207 Numerical
43 total_rev_hi_lim 5124 63999 3617 Numerical
44 loan_status 0 63999 2 Numerical
Null value summary:
Total Percent
verification_status_joint 63971 99.956249
desc 54849 85.702902
mths_since_last_record 54349 84.921639
mths_since_last_major_derog 48155 75.243363
mths_since_last_delinq 32831 51.299239
batch_enrolled 10264 16.037751
tot_cur_bal 5124 8.006375
tot_coll_amt 5124 8.006375
total_rev_hi_lim 5124 8.006375
emp_title 3826 5.978218
emp_length 3324 5.193831
revol_util 29 0.045313
title 13 0.020313
collections_12_mths_ex_med 8 0.012500
columns dropped Index(['verification_status_joint', 'desc', 'mths_since_last_record',
'mths_since_last_major_derog', 'mths_since_last_delinq'],
dtype='object')
columns dropped
['verification_status_joint', 'desc', 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq']
['member_id', 'pymnt_plan', 'batch_enrolled']
columns inputed with : unknown are ['emp_title', 'title'] unknown Filled inplace of NaNs unknown Filled inplace of NaNs
columns inputed with : mean are ['revol_util', 'tot_cur_bal', 'total_rev_hi_lim']
revol_util filled with: 55.045863060809744
tot_cur_bal filled with: 139812.53492993632
total_rev_hi_lim filled with: 32223.922038216562
columns label encoded
Index(['term', 'grade', 'sub_grade', 'emp_title', 'emp_length',
'home_ownership', 'verification_status', 'purpose', 'title', 'zip_code',
'addr_state', 'initial_list_status', 'application_type',
'last_week_pay'],
dtype='object')
************************************************************
OLS Regression Results
=======================================================================================
Dep. Variable: loan_status R-squared (uncentered): 0.357
Model: OLS Adj. R-squared (uncentered): 0.357
Method: Least Squares F-statistic: 1110.
Date: Tue, 17 Mar 2020 Prob (F-statistic): 0.00
Time: 15:52:24 Log-Likelihood: -30665.
No. Observations: 63999 AIC: 6.139e+04
Df Residuals: 63967 BIC: 6.168e+04
Df Model: 32
Covariance Type: nonrobust
===========================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------
loan_amnt -2.078e-06 2.67e-07 -7.784 0.000 -2.6e-06 -1.55e-06
term -0.0601 0.004 -14.037 0.000 -0.069 -0.052
int_rate 0.0846 0.001 64.946 0.000 0.082 0.087
grade -0.0181 0.005 -3.321 0.001 -0.029 -0.007
sub_grade -0.0515 0.001 -34.544 0.000 -0.054 -0.049
emp_title -2.238e-06 1.65e-07 -13.588 0.000 -2.56e-06 -1.92e-06
emp_length -0.0013 0.001 -2.548 0.011 -0.002 -0.000
home_ownership -0.0027 0.001 -2.926 0.003 -0.004 -0.001
annual_inc -1.405e-07 3.62e-08 -3.884 0.000 -2.11e-07 -6.96e-08
verification_status -0.0059 0.002 -2.805 0.005 -0.010 -0.002
purpose -0.0040 0.001 -5.099 0.000 -0.005 -0.002
title 3.48e-05 1.56e-06 22.313 0.000 3.17e-05 3.79e-05
zip_code -3.354e-06 5.81e-06 -0.577 0.564 -1.47e-05 8.04e-06
addr_state -0.0005 0.000 -4.756 0.000 -0.001 -0.000
dti -0.0051 0.000 -23.481 0.000 -0.005 -0.005
delinq_2yrs -0.0229 0.002 -12.517 0.000 -0.027 -0.019
inq_last_6mths 0.0232 0.002 14.117 0.000 0.020 0.026
open_acc -0.0076 0.000 -17.411 0.000 -0.008 -0.007
pub_rec -0.0394 0.003 -13.728 0.000 -0.045 -0.034
revol_bal -2.196e-07 1.51e-07 -1.451 0.147 -5.16e-07 7.71e-08
revol_util -0.0009 8.26e-05 -10.311 0.000 -0.001 -0.001
total_acc 0.0033 0.000 17.411 0.000 0.003 0.004
initial_list_status -0.1331 0.003 -40.528 0.000 -0.140 -0.127
total_rec_int -2.848e-06 1.04e-06 -2.740 0.006 -4.89e-06 -8.11e-07
total_rec_late_fee -0.0006 0.000 -1.704 0.088 -0.001 9.44e-05
recoveries -0.0001 6.84e-06 -20.976 0.000 -0.000 -0.000
collection_recovery_fee 0.0004 4.18e-05 8.410 0.000 0.000 0.000
application_type -0.0415 0.074 -0.561 0.575 -0.186 0.103
last_week_pay -0.0004 6.09e-05 -7.363 0.000 -0.001 -0.000
acc_now_delinq -0.0310 0.020 -1.588 0.112 -0.069 0.007
tot_cur_bal 1.139e-07 1.46e-08 7.819 0.000 8.54e-08 1.43e-07
total_rev_hi_lim 2.649e-07 1.11e-07 2.381 0.017 4.68e-08 4.83e-07
==============================================================================
Omnibus: 7448.626 Durbin-Watson: 2.001
Prob(Omnibus): 0.000 Jarque-Bera (JB): 10200.031
Skew: 0.970 Prob(JB): 0.00
Kurtosis: 2.750 Cond. No. 1.05e+07
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.05e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
******** VIF *********
feature vif
4 sub_grade 153.09
2 int_rate 147.28
3 grade 61.36
17 open_acc 12.90
21 total_acc 11.97
31 total_rev_hi_lim 10.39
20 revol_util 10.31
0 loan_amnt 8.65
14 dti 7.85
19 revol_bal 7.45
11 title 5.76
28 last_week_pay 5.39
8 annual_inc 4.71
5 emp_title 4.33
7 home_ownership 4.27
12 zip_code 3.89
30 tot_cur_bal 3.65
13 addr_state 3.49
23 total_rec_int 3.40
9 verification_status 3.15
10 purpose 3.00
25 recoveries 2.87
26 collection_recovery_fee 2.78
1 term 2.32
22 initial_list_status 2.25
6 emp_length 2.19
16 inq_last_6mths 1.72
32 loan_status 1.56
15 delinq_2yrs 1.20
18 pub_rec 1.18
24 total_rec_late_fee 1.03
29 acc_now_delinq 1.03
27 application_type 1.00
*************** Random Forest ***************
-----------------------------------------
************* Model Results *************
-----------------------------------------
F1 Score : 0.781688137529659
Report:
precision recall f1-score support
0 0.87 0.82 0.85 14622
1 0.52 0.62 0.57 4578
accuracy 0.78 19200
macro avg 0.70 0.72 0.71 19200
weighted avg 0.79 0.78 0.78 19200
-----------------------------------------
# es.classification shows an interactive menu to pick the model to train.
y_pred, model = es.classification(X,y)
# NOTE(review): y_test comes from the earlier train_test_split call; this is
# only valid if es.classification performs the same split internally -- confirm.
es.classification_result(y_test,y_pred)
es.roc_curve_graph(y_test, y_pred)
# Train and compare all classifiers at once ('c' = classification flag).
es.quick_pred(X,y,'c')
########## MENU ############## 1. Logisitic Regression 2. Random Forest 3. XGB Classifier 4. LGB CLassifier 5. Gradient Boosting Classifier 6. AdaBoost Classifier 99. For all models Input No, which you want:6 Predicting with Ada Boost CLassifier
Model used: AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
n_estimators=100, random_state=None)
Done.
-----------------------------------------
************* Model Results *************
-----------------------------------------
F1 Score : 0.6576365174253705
Report:
precision recall f1-score support
0 0.76 0.87 0.81 14622
1 0.23 0.12 0.16 4578
accuracy 0.69 19200
macro avg 0.50 0.50 0.49 19200
weighted avg 0.63 0.69 0.66 19200
-----------------------------------------
Please wait Training-Testing with all models.. Done with LogisticRegression Done with RandomForestClassifier Done with XGBoost Classifier Done with LGBoost Classifier Done with AdaBoost Classifier Done with GradientBoostingClassifier
| Model | F1 score | AUC-ROC score | Accuracy | Confustion-Matrix | |
|---|---|---|---|---|---|
| 3 | LGBoost Classifier | 0.926282 | 0.8 | 0.933333 | [[25 0] [ 2 3]] |
| 5 | GradientBoostingClassifier | 0.926282 | 0.8 | 0.933333 | [[25 0] [ 2 3]] |
| 2 | XGBoost Classifier | 0.881402 | 0.7 | 0.9 | [[25 0] [ 3 2]] |
| 4 | AdaBoost Classifier | 0.881402 | 0.7 | 0.9 | [[25 0] [ 3 2]] |
| 1 | RandomForestClassifier | 0.852564 | 0.68 | 0.866667 | [[24 1] [ 3 2]] |
| 0 | LogisticRegression | 0.757576 | 0.5 | 0.833333 | [[25 0] [ 5 0]] |
## Car popularity dataset: manual workflow with hyperparameter tuning.
df = es.importdata('TrainDataset.csv')
es.info(df)
# Split into features and target.
X = df.drop('popularity',axis=1)
y = df['popularity']
# NOTE(review): train_test_split is not imported anywhere in the visible file;
# it is presumably sklearn.model_selection.train_test_split and must be
# imported before this cell can run in a fresh session.
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=42)
# Print the OLS regression summary for y against X.
es.stat_models(y,X)
# NOTE(review): df still contains the target 'popularity', so it is included
# in the VIF table; pass X instead if only the features are of interest.
es.VIF(df)
# Pick a model from the interactive menu; the returned model is tuned below.
y_pred , model = es.classification(X,y)
print('\n\n*** Start Hypertuning ***\n')
# Run GridSearchCV on the chosen model; y_pred is overwritten with the
# predictions returned by the grid search.
y_pred,grid = es.grid_search(model,'quick',X_train,y_train,X_test,y_test)
print('\n\n*** Grid Search CV Result ***\n')
es.classification_result(y_test,y_pred)
Dataframe Imported Successfully
*** Dataset Infomarion ***
##################################################
No of Rows : 1302
No of columns : 7
No of Numerical columns: 7
No of Categorical columns: 0
##################################################
Total Missing values : 0
##################################################
Summary of DataFrame:
Column Name Nulls/NaN outof Unique Type of Columns
0 buying_price 0 1302 4 Numerical
1 maintainence_cost 0 1302 4 Numerical
2 number_of_doors 0 1302 4 Numerical
3 number_of_seats 0 1302 3 Numerical
4 luggage_boot_size 0 1302 3 Numerical
5 safety_rating 0 1302 3 Numerical
6 popularity 0 1302 4 Numerical
************************************************************
OLS Regression Results
=======================================================================================
Dep. Variable: popularity R-squared (uncentered): 0.894
Model: OLS Adj. R-squared (uncentered): 0.894
Method: Least Squares F-statistic: 1825.
Date: Tue, 17 Mar 2020 Prob (F-statistic): 0.00
Time: 16:27:41 Log-Likelihood: -915.04
No. Observations: 1302 AIC: 1842.
Df Residuals: 1296 BIC: 1873.
Df Model: 6
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
buying_price -0.1362 0.012 -11.721 0.000 -0.159 -0.113
maintainence_cost -0.1092 0.011 -9.546 0.000 -0.132 -0.087
number_of_doors 0.0537 0.011 5.102 0.000 0.033 0.074
number_of_seats 0.2138 0.009 22.528 0.000 0.195 0.232
luggage_boot_size 0.1288 0.015 8.385 0.000 0.099 0.159
safety_rating 0.3725 0.015 24.433 0.000 0.343 0.402
==============================================================================
Omnibus: 281.511 Durbin-Watson: 2.017
Prob(Omnibus): 0.000 Jarque-Bera (JB): 701.940
Skew: 1.150 Prob(JB): 3.76e-153
Kurtosis: 5.766 Cond. No. 8.35
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
******** VIF *********
feature vif
3 number_of_seats 10.14
6 popularity 9.45
5 safety_rating 8.54
2 number_of_doors 8.28
4 luggage_boot_size 6.26
0 buying_price 6.20
1 maintainence_cost 5.73
########## MENU ##############
1. Logisitic Regression
2. Random Forest
3. XGB Classifier
4. LGB CLassifier
5. Gradient Boosting Classifier
6. AdaBoost Classifier
99. For all models
Input No, which you want:2
Predicting with Random Forest Classifier
Model used: RandomForestClassifier(bootstrap=True, class_weight='balanced',
criterion='gini', max_depth=8, max_features='auto',
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=None, oob_score=False,
random_state=101, verbose=0, warm_start=False)
Done.
*** Start Hypertuning ***
Performing GridSearchCV
Please wait............
Done ¯\_(ツ)_/¯
best parameters:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
criterion='gini', max_depth=80, max_features='auto',
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=3,
min_samples_split=8, min_weight_fraction_leaf=0.0,
n_estimators=30, n_jobs=None, oob_score=False,
random_state=101, verbose=0, warm_start=False)
*** Grid Search CV Result ***
-----------------------------------------
************* Model Results *************
-----------------------------------------
F1 Score : 0.939271732329321
Report:
precision recall f1-score support
1 1.00 0.94 0.97 277
2 0.86 0.90 0.88 94
3 0.67 1.00 0.80 12
4 0.62 1.00 0.76 8
accuracy 0.94 391
macro avg 0.79 0.96 0.85 391
weighted avg 0.95 0.94 0.94 391
-----------------------------------------
es.quick_pred(X,y,'c') ## use quick_pred directly on clean data to get immediate results
Please wait Training-Testing with all models.. Done with LogisticRegression Done with RandomForestClassifier Done with XGBoost Classifier Done with LGBoost Classifier Done with AdaBoost Classifier Done with GradientBoostingClassifier
| Model | F1 score | AUC-ROC score | Accuracy | Confustion-Matrix | |
|---|---|---|---|---|---|
| 1 | RandomForestClassifier | 1 | 1 | 1 | [[21 0] [ 0 9]] |
| 3 | LGBoost Classifier | 1 | 1 | 1 | [[21 0] [ 0 9]] |
| 2 | XGBoost Classifier | 0.967137 | 0.97619 | 0.966667 | [[20 1] [ 0 9]] |
| 5 | GradientBoostingClassifier | 0.967137 | 0.97619 | 0.966667 | [[20 1] [ 0 9]] |
| 4 | AdaBoost Classifier | 0.898222 | 0.865079 | 0.9 | [[20 1] [ 2 7]] |
| 0 | LogisticRegression | 0.822222 | 0.753968 | 0.833333 | [[20 1] [ 4 5]] |